This code covers chapter 8 of “Introduction to Data Mining” by Pang-Ning Tan, Michael Steinbach and Vipin Kumar. See table of contents for code examples for other chapters.
This work is licensed under the Creative Commons Attribution 4.0 International License. For questions please contact Michael Hahsler.
The ruspini data set is in package cluster. It is a very simple data set with well-separated clusters.
data(ruspini, package="cluster")
Shuffle rows
# Shuffle the row order so it carries no information for clustering.
# sample(n) permutes seq_len(n); safer idiom than sample(1:nrow(x)).
ruspini <- ruspini[sample(nrow(ruspini)), ]
plot(ruspini)
Scale each column in the data to zero mean and unit standard deviation (z-scores). This prevents an attribute with a large range from dominating the others in the distance calculation.
# Standardize both columns to z-scores so each attribute contributes
# equally to the Euclidean distance computation.
ruspini_scaled <- scale(ruspini)
plot(ruspini_scaled)
Assumes Euclidean distances. We use k=4 clusters and run the algorithm 10 times with randomly initialized centroids. The best result is returned.
# k-means with k = 4; nstart = 10 restarts from random initial centroids
# and keeps the solution with the lowest total within-cluster SS.
km <- kmeans(ruspini_scaled, centers=4, nstart=10)
km
## K-means clustering with 4 clusters of sizes 23, 20, 17, 15
##
## Cluster means:
## x y
## 1 -0.3595425 1.1091151
## 2 -1.1385941 -0.5559591
## 3 1.4194387 0.4692907
## 4 0.4607268 -1.4912271
##
## Clustering vector:
## 57 19 33 62 52 28 37 13 74 39 61 43 55 35 70 49 17 3 47 30 36 12 2 25 27
## 3 2 1 4 3 1 1 2 4 1 4 1 3 1 4 3 2 2 3 1 1 2 2 1 1
## 16 65 24 26 42 8 64 66 11 22 72 44 68 51 63 15 50 18 4 14 67 32 59 48 45
## 2 4 1 1 1 2 4 4 2 1 4 3 4 3 4 2 3 2 2 2 4 1 3 3 3
## 75 56 53 23 31 9 6 41 46 54 69 34 58 40 71 10 38 73 20 5 29 7 1 21 60
## 4 3 3 1 1 2 2 1 3 3 4 1 3 1 4 2 1 4 2 2 1 2 2 1 3
##
## Within cluster sum of squares by cluster:
## [1] 2.658679 2.705477 3.641276 1.082373
## (between_SS / total_SS = 93.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot with points colored by k-means cluster membership
plot(ruspini_scaled, col=km$cluster)
points(km$centers, pch=3, cex=2) # this adds the centroids
text(km$centers, labels=1:4, pos=2) # this adds the cluster ID
Alternative plot from package cluster (uses principal components analysis for >2 dimensions)
library(cluster)
##
## Attaching package: 'cluster'
## The following object is masked _by_ '.GlobalEnv':
##
## ruspini
clusplot(ruspini_scaled, km$cluster)
Inspect the centroids (cluster profiles)
km$centers
## x y
## 1 -0.3595425 1.1091151
## 2 -1.1385941 -0.5559591
## 3 1.4194387 0.4692907
## 4 0.4607268 -1.4912271
def.par <- par(no.readonly = TRUE) # save default, for resetting...
n_clust <- nrow(km$centers)        # generalize: works for any k, not just 4
layout(t(seq_len(n_clust)))        # one barplot per cluster, side by side
for(i in seq_len(n_clust))
  barplot(km$centers[i,], ylim=c(-2,2), main=paste("Cluster", i))
par(def.par) #- reset to default
Find data for a single cluster
All you need is to select the rows corresponding to the cluster. The next example plots all data points of cluster 1
# Logical subsetting: keep only the rows assigned to cluster 1
cluster1 <- ruspini_scaled[km$cluster==1,]
head(cluster1)
## x y
## 33 -0.3566917 1.0466240
## 28 -0.5533967 1.0466240
## 37 -0.1599867 1.0260913
## 39 -0.0944184 1.2314190
## 43 0.2662074 0.9644929
## 35 -0.2583392 1.1698207
plot(cluster1, xlim = c(-2,2), ylim = c(-2,2))
Try 10 clusters
plot(ruspini_scaled, col=kmeans(ruspini_scaled, centers=10)$cluster)
dist defaults to method="euclidean".
d <- dist(ruspini_scaled)
We cluster using complete link
hc <- hclust(d, method="complete")
Dendrogram
# Dendrogram with the 4-cluster cut marked by red rectangles
plot(hc)
rect.hclust(hc, k=4)
plot(as.dendrogram(hc), leaflab="none") # plot dendrogram without leaf labels
More plotting options for dendrograms, including plotting parts of large dendrograms can be found here.
# Cut the dendrogram to obtain a flat partition with k = 4 clusters
cluster_complete <- cutree(hc, k=4)
plot(ruspini_scaled, col=cluster_complete)
Try 10 clusters
plot(ruspini_scaled, col=cutree(hc, k=10))
Clustering with single link
# Single-link (minimum pairwise distance) hierarchical clustering
hc_single <- hclust(d, method="single")
plot(hc_single)
rect.hclust(hc_single, k=4)
cluster_single <- cutree(hc_single, k=4)
plot(ruspini_scaled, col=cluster_single)
Try 10 clusters
plot(ruspini_scaled, col=cutree(hc_single, k=10))
library(dbscan)
Parameters: minPts is often chosen as dimensionality of the data +1. Decide on epsilon using the knee in the kNN distance plot (seems to be around eps = .25).
# Plot each point's distance to its 3rd nearest neighbor, sorted.
# The knee (around 0.25 here) is a good candidate for eps.
kNNdistplot(ruspini_scaled, k = 3)
abline(h=.25, col="red")
run dbscan
# DBSCAN with neighborhood radius eps = 0.25 and minPts = 3
db <- dbscan(ruspini_scaled, eps=.25, minPts=3)
db
## DBSCAN clustering for 75 objects.
## Parameters: eps = 0.25, minPts = 3
## The clustering contains 5 cluster(s) and 6 noise points.
##
## 0 1 2 3 4 5
## 6 12 19 20 15 3
##
## Available fields: cluster, eps, minPts
str(db)
## List of 3
## $ cluster: int [1:75] 1 2 3 4 1 3 3 2 4 3 ...
## $ eps : num 0.25
## $ minPts : num 3
## - attr(*, "class")= chr [1:2] "dbscan_fast" "dbscan"
plot(ruspini_scaled, col=db$cluster+1L)
Note: 0 is not a color so we add 1 to cluster (0 is black now).
Alternative visualization from package dbscan
hullplot(ruspini_scaled, db)
Play with eps (neighborhood radius) and minPts (minimum number of points in the eps-neighborhood needed for a core point).
library(mclust)
## Package 'mclust' version 5.3
## Type 'citation("mclust")' for citing this R package in publications.
Mclust uses Bayesian Information Criterion (BIC) to find the number of clusters (model selection). BIC uses the likelihood and a penalty term to guard against overfitting.
m <- Mclust(ruspini_scaled)
summary(m)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEI (diagonal, equal volume and shape) model with 5 components:
##
## log.likelihood n df BIC ICL
## -91.26485 75 16 -251.6095 -251.7486
##
## Clustering table:
## 1 2 3 4 5
## 14 20 23 15 3
plot(m, what = "classification")
Rerun with a fixed number of 4 clusters
m <- Mclust(ruspini_scaled, G=4)
summary(m)
## ----------------------------------------------------
## Gaussian finite mixture model fitted by EM algorithm
## ----------------------------------------------------
##
## Mclust EEI (diagonal, equal volume and shape) model with 4 components:
##
## log.likelihood n df BIC ICL
## -101.6027 75 13 -259.3327 -259.3356
##
## Clustering table:
## 1 2 3 4
## 17 20 23 15
plot(m, what = "classification")
library("kernlab")
cluster_spec <- specc(ruspini_scaled, centers = 4)
plot(ruspini_scaled, col=cluster_spec)
Look at the within.cluster.ss and the avg.silwidth
#library(fpc)
Note: I do not load fpc since the NAMESPACE overwrites dbscan.
fpc::cluster.stats(d, km$cluster)
## $n
## [1] 75
##
## $cluster.number
## [1] 4
##
## $cluster.size
## [1] 23 20 17 15
##
## $min.cluster.size
## [1] 15
##
## $noisen
## [1] 0
##
## $diameter
## [1] 1.1591436 1.1192822 1.4627043 0.8359025
##
## $average.distance
## [1] 0.4286139 0.4824376 0.5805551 0.3564457
##
## $median.distance
## [1] 0.3934100 0.4492432 0.5023855 0.3379729
##
## $separation
## [1] 0.767612 1.157682 0.767612 1.157682
##
## $average.toother
## [1] 2.148527 2.157193 2.293318 2.307969
##
## $separation.matrix
## [,1] [,2] [,3] [,4]
## [1,] 0.000000 1.219930 0.767612 1.957726
## [2,] 1.219930 0.000000 1.339721 1.157682
## [3,] 0.767612 1.339721 0.000000 1.308435
## [4,] 1.957726 1.157682 1.308435 0.000000
##
## $ave.between.matrix
## [,1] [,2] [,3] [,4]
## [1,] 0.000000 1.887363 1.924915 2.750174
## [2,] 1.887363 0.000000 2.771960 1.874198
## [3,] 1.924915 2.771960 0.000000 2.220011
## [4,] 2.750174 1.874198 2.220011 0.000000
##
## $average.between
## [1] 2.219257
##
## $average.within
## [1] 0.462697
##
## $n.between
## [1] 2091
##
## $n.within
## [1] 684
##
## $max.diameter
## [1] 1.462704
##
## $min.separation
## [1] 0.767612
##
## $within.cluster.ss
## [1] 10.08781
##
## $clus.avg.silwidths
## 1 2 3 4
## 0.7454551 0.7211353 0.6812849 0.8073733
##
## $avg.silwidth
## [1] 0.7368082
##
## $g2
## NULL
##
## $g3
## NULL
##
## $pearsongamma
## [1] 0.8415597
##
## $dunn
## [1] 0.5247896
##
## $dunn2
## [1] 3.228286
##
## $entropy
## [1] 1.37327
##
## $wb.ratio
## [1] 0.2084918
##
## $ch
## [1] 323.5512
##
## $cwidegap
## [1] 0.3152817 0.2611701 0.4149825 0.2351854
##
## $widestgap
## [1] 0.4149825
##
## $sindex
## [1] 0.8583457
##
## $corrected.rand
## NULL
##
## $vi
## NULL
#cluster.stats(d, cluster_complete)
#cluster.stats(d, cluster_single)
Read ? cluster.stats for an explanation of all the available indices.
# Compare internal validation measures across the three clusterings.
# cluster.stats() returns a list, so sapply() over the list of cluster
# vectors yields a matrix of list elements; row subsetting keeps only
# the two measures of interest.
sapply(list(
km=km$cluster,
hc_compl=cluster_complete,
hc_single=cluster_single),
FUN=function(x)
fpc::cluster.stats(d, x))[c("within.cluster.ss","avg.silwidth"),]
## km hc_compl hc_single
## within.cluster.ss 10.08781 10.08781 10.08781
## avg.silwidth 0.7368082 0.7368082 0.7368082
library(cluster)
plot(silhouette(km$cluster, d))
Note: The silhouette plot does not show correctly in RStudio if you have too many objects (bars are missing). It will work when you open a new plotting device with windows(), x11() or quartz().
plot(ruspini_scaled)
set.seed(1234)
ks <- 2:10
Use within sum of squares and look for the knee (nstart=5 repeats k-means 5 times and returns the best solution)
# Total within-cluster sum of squares for each k; look for the knee.
# vapply() (instead of sapply()) guarantees a numeric vector result.
WSS <- vapply(ks, FUN=function(k) {
  kmeans(ruspini_scaled, centers=k, nstart=5)$tot.withinss
}, FUN.VALUE=numeric(1))
plot(ks, WSS, type="l")
abline(v=4, col="red", lty=2)
Use average silhouette width (look for the max)
# Average silhouette width for each k; the best k maximizes it.
# vapply() (instead of sapply()) guarantees a numeric vector result.
ASW <- vapply(ks, FUN=function(k) {
  fpc::cluster.stats(d, kmeans(ruspini_scaled, centers=k, nstart=5)$cluster)$avg.silwidth
}, FUN.VALUE=numeric(1))
plot(ks, ASW, type="l")
ks[which.max(ASW)]
## [1] 4
abline(v=ks[which.max(ASW)], col="red", lty=2)
Use Dunn index (another internal measure given by min. separation/ max. diameter)
# Dunn index (min. separation / max. diameter) for each k; maximize it.
# vapply() (instead of sapply()) guarantees a numeric vector result.
DI <- vapply(ks, FUN=function(k) {
  fpc::cluster.stats(d, kmeans(ruspini_scaled, centers=k, nstart=5)$cluster)$dunn
}, FUN.VALUE=numeric(1))
plot(ks, DI, type="l")
ks[which.max(DI)]
## [1] 4
abline(v=ks[which.max(DI)], col="red", lty=2)
Compares the change in within-cluster dispersion with that expected from a null model (see ? clusGap). The default method is to choose the smallest k such that its value Gap(k) is not more than 1 standard error away from the first local maximum.
library(cluster)
k <- clusGap(ruspini_scaled, FUN = kmeans, nstart = 10, K.max = 10)
k
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = ruspini_scaled, FUNcluster = kmeans, K.max = 10, nstart = 10)
## B=100 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 4
## logW E.logW gap SE.sim
## [1,] 3.497911 3.474301 -0.02360980 0.03835335
## [2,] 3.073937 3.157027 0.08309024 0.04005118
## [3,] 2.678004 2.908822 0.23081785 0.03767224
## [4,] 2.106416 2.710845 0.60442835 0.03725234
## [5,] 1.987213 2.571981 0.58476735 0.03699570
## [6,] 1.863936 2.455285 0.59134824 0.03851437
## [7,] 1.732234 2.350146 0.61791223 0.03754122
## [8,] 1.640492 2.257624 0.61713194 0.03782124
## [9,] 1.579299 2.172410 0.59311149 0.03841206
## [10,] 1.524184 2.094176 0.56999210 0.03782182
plot(k)
Note: these methods can also be used for hierarchical clustering.
There have been many other methods and indices proposed to determine the number of clusters. See, e.g., package NbClust.
Visualizing the unordered distance matrix does not show much structure.
plot(ruspini_scaled)
d <- dist(ruspini_scaled)
library(seriation)
pimage(d)
Reorder using cluster labels
pimage(d, order=order(km$cluster))
Use dissplot which rearranges clusters, adds cluster labels, and shows average dissimilarity in the lower half of the plot.
dissplot(d, labels=km$cluster, options=list(main="k-means with k=4"))
dissplot(d, labels=db$cluster+1, options=list(main="DBSCAN"))
Spot the problem data points for DBSCAN (we use +1 so the noise is now cluster #1)
Misspecified k
dissplot(d, labels=kmeans(ruspini_scaled, centers=3)$cluster)
dissplot(d, labels=kmeans(ruspini_scaled, centers=9)$cluster)
External cluster validation uses ground truth information. That is, the user has an idea how the data should be grouped. This could be a known class label not provided to the clustering algorithm.
We use an artificial data set with known groups here. First, we need to cluster the new data. We do k-means and hierarchical clustering.
library(mlbench)
shapes <- mlbench.smiley(n=500, sd1 = 0.1, sd2 = 0.05)
plot(shapes)
Prepare data
truth <- as.integer(shapes$class)
shapes <- scale(shapes$x)
plot(shapes)
Find optimal number of Clusters for k-means
ks <- 2:20
Use within sum of squares (look for the knee)
# Total within-cluster SS per k for the smiley data (knee method).
# vapply() (instead of sapply()) guarantees a numeric vector result.
WSS <- vapply(ks, FUN=function(k) {
  kmeans(shapes, centers=k, nstart=10)$tot.withinss
}, FUN.VALUE=numeric(1))
plot(ks, WSS, type="l")
looks like 6 clusters
km <- kmeans(shapes, centers=6, nstart = 10)
plot(shapes, col=km$cluster)
Hierarchical clustering: single-link because of the mouth
d <- dist(shapes)
hc <- hclust(d, method="single")
Find optimal number of clusters
# Average silhouette width for cutting the single-link dendrogram at k.
# vapply() (instead of sapply()) guarantees a numeric vector result.
ASW <- vapply(ks, FUN=function(k) {
  fpc::cluster.stats(d, cutree(hc, k))$avg.silwidth
}, FUN.VALUE=numeric(1))
plot(ks, ASW, type="l")
4 clusters
hc_4 <- cutree(hc, 4)
plot(shapes, col=hc_4)
Spectral Clustering
library("kernlab")
spec <- specc(shapes, centers = 4)
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 25000)
plot(shapes, col=spec)
Compare with ground truth with the corrected (=adjusted) Rand index (ARI), the variation of information (VI) index, entropy and purity.
#
define entropy and purity
# Entropy of a clustering w.r.t. ground truth: the size-weighted average
# (over clusters) of the entropy of the class distribution inside each
# cluster (base-2 log). 0 means every cluster is pure.
#
# Bug fix: the previous version built the contingency table with truth
# classes as rows, normalized by truth-class totals, and then weighted
# the truth-indexed row entropies by CLUSTER sizes — an index mismatch
# (e.g. a single-cluster solution wrongly scored 0). We now tabulate
# with clusters as rows and normalize/weight by cluster size.
entropy <- function(cluster, truth) {
  k <- max(cluster, truth)
  cluster <- factor(cluster, levels = 1:k)
  truth <- factor(truth, levels = 1:k)
  m <- length(cluster)
  cnts <- table(cluster, truth)   # rows = clusters, columns = true classes
  w <- rowSums(cnts)              # cluster sizes
  p <- cnts / w                   # p[i, j] = fraction of class j in cluster i
  p[is.nan(p)] <- 0               # empty clusters (division by 0) contribute 0
  e <- -p * log(p, 2)             # 0 * log(0) produces NaN; removed below
  sum(rowSums(e, na.rm = TRUE) * w / m)
}
# Purity of a clustering w.r.t. ground truth: the fraction of points
# belonging to the majority true class of their cluster. 1 means every
# cluster contains only a single class.
#
# Bug fix: the previous version normalized by truth-class totals and
# weighted truth-indexed maxima by cluster sizes — an index mismatch
# (e.g. a single-cluster solution wrongly scored 1). Standard purity is
# simply the sum of per-cluster majority-class counts divided by m.
purity <- function(cluster, truth) {
  k <- max(cluster, truth)
  cluster <- factor(cluster, levels = 1:k)
  truth <- factor(truth, levels = 1:k)
  m <- length(cluster)
  cnts <- table(cluster, truth)   # rows = clusters, columns = true classes
  sum(apply(cnts, 1, max)) / m    # majority class count per cluster
}
calculate measures (for comparison we also use random “clusterings” with 4 and 6 clusters)
# Helper: all external validation measures for one clustering —
# corrected Rand and VI from fpc::cluster.stats, plus entropy and purity.
external_validation <- function(cluster) {
  c(
    unlist(fpc::cluster.stats(d, cluster, truth, compareonly = TRUE)),
    entropy = entropy(cluster, truth),
    purity = purity(cluster, truth)
  )
}

# Random "clusterings" with 4 and 6 clusters serve as baselines.
random4 <- sample(1:4, nrow(shapes), replace = TRUE)
random6 <- sample(1:6, nrow(shapes), replace = TRUE)

r <- rbind(
  kmeans  = external_validation(km$cluster),
  hc      = external_validation(hc_4),
  spec    = external_validation(spec),
  random4 = external_validation(random4),
  random6 = external_validation(random6)
)
r
## corrected.rand vi entropy purity
## kmeans 0.695694371 0.4382154 0.2631692 0.4844306
## hc 1.000000000 0.0000000 0.0000000 1.0000000
## spec 0.548508174 0.5566220 0.6673703 0.6780723
## random4 0.003296024 2.6665655 1.9788525 0.2923809
## random6 -0.002615134 3.0773033 1.6964185 0.1432732
Hierarchical clustering found the perfect clustering.
Read ? cluster.stats for an explanation of all the available indices.